library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(xgboost)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.5
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ dplyr::combine()         masks randomForest::combine()
## ✖ lubridate::date()        masks base::date()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ lubridate::intersect()   masks base::intersect()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ ggplot2::margin()        masks randomForest::margin()
## ✖ lubridate::setdiff()     masks base::setdiff()
## ✖ dplyr::slice()           masks xgboost::slice()
## ✖ lubridate::union()       masks base::union()
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(segmented)
source('functions.r')

load("Table_construction.Rdata") 
features = features %>%
  # Add other useful information:
  inner_join(
    data_before %>% 
      select(person_id, screening_date, people) %>%
      unnest() %>%
      select(person_id, screening_date, race, sex, name),
    by = c("person_id","screening_date")
  ) %>%
  inner_join(features_on, by = c("person_id","screening_date")) %>%
  inner_join(outcomes, by = c("person_id","screening_date")) %>%
    select(-c(offenses_within_30.y)) %>%

  # Create as many features as possible:
  mutate(
    raw_score = `Risk of Recidivism_raw_score`, # Adjust for violent/general
    decile_score = `Risk of Recidivism_decile_score`, # Adjust for violent/general
    p_jail30 = pmin(p_jail30,5),
    # BUG FIX: was pmin(p_jail30,5), which capped p_prison30 using the jail
    # column instead of its own values.
    p_prison30 = pmin(p_prison30,5),
    p_prison = pmin(p_prison,5),
    p_probation = pmin(p_probation,5),
    race_black = if_else(race=="African-American",1,0),
    race_white = if_else(race=="Caucasian",1,0),
    race_hispanic = if_else(race=="Hispanic",1,0),
    race_asian = if_else(race=="Asian",1,0),
    race_native = if_else(race=="Native American",1,0), # race == "Other" is the baseline
    
    # Subscales:
    crim_inv = p_charge+
      p_jail30+
      p_prison+
      p_probation,
    
    # Filters (TRUE for observations to keep)
    # BUG FIX: the two decile-score conditions were separated by a comma, so
    # the violence check became a stray unnamed mutate expression and filt1
    # only tested the recidivism score. Combine both conditions with `&`.
    filt1 = `Risk of Recidivism_decile_score` != -1 & `Risk of Violence_decile_score` != -1, 
    filt2 = offenses_within_30.x == 1,
    filt3 = !is.na(current_offense_date), 
    filt4 = current_offense_date <= current_offense_date_limit, 
    filt5 = p_current_age > 18 & p_current_age <= 65, 
    filt6 = crim_inv == 0  
  )

Modelling the COMPAS Risk of Recidivism score with a quadratic polynomial in age.

# Rows passing the decile-score (filt1), age-range (filt5), and
# zero-criminal-involvement (filt6) filters, used to fit the age polynomial.
features_age_poly = features %>%
  filter(filt1,filt5, filt6) 


#filter out any individuals with crim inv history on lb of age poly
# Lower bound: for each age, keep the single observation with the smallest
# raw score (top_n with n = -1 takes the bottom 1 ranked by wt).
lb_age = features_age_poly %>%
  group_by(p_current_age) %>%
  arrange(raw_score) %>%
  top_n(n=-1, wt=raw_score) # Fit lower bound on smallest value

# Quadratic fit of the lower-bound raw score as a function of age.
# NOTE: term order (I(age^2) before age) is relied on by the coefficient
# printout below — do not reorder.
mdl_age = lm(raw_score ~ 
               I(p_current_age^2) + 
               p_current_age, 
             data=lb_age)

# More precision for paper
summary(mdl_age)
## 
## Call:
## lm(formula = raw_score ~ I(p_current_age^2) + p_current_age, 
##     data = lb_age)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.22772 -0.01663  0.01004  0.02784  0.14974 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -3.580e-02  6.114e-02  -0.586     0.56    
## I(p_current_age^2)  4.634e-04  3.664e-05  12.646   <2e-16 ***
## p_current_age      -7.480e-02  3.096e-03 -24.161   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05025 on 93 degrees of freedom
## Multiple R-squared:  0.9879, Adjusted R-squared:  0.9877 
## F-statistic:  3806 on 2 and 93 DF,  p-value: < 2.2e-16
print("Coefficients:")
## [1] "Coefficients:"
sprintf("%.20e",mdl_age$coefficients) # More precision for paper
## [1] "-3.58034341549837223373e-02" "4.63390462694484011469e-04" 
## [3] "-7.48013245320391095827e-02"
## Add f(age) to features
# f_age is the fitted polynomial at each person's age; filt7 keeps points at
# or above the polynomial minus a 0.05 tolerance.
features = features %>%
  mutate(
    f_age = predict(mdl_age, newdata=data.frame(p_current_age=p_current_age)),
    raw_score__age_poly = raw_score - f_age,
    filt7 = raw_score >= f_age - 0.05
  )
## Add same filters to lb_age 
lb_age = lb_age %>% 
    mutate(
    f_age = predict(mdl_age, newdata=data.frame(p_current_age=p_current_age)),
    raw_score__age_poly = raw_score - f_age,
    filt7 = raw_score >= f_age - 0.05
    )

Plotting settings

## Age-axis plot range and a dense grid for drawing fitted curves.
xmin = 18
xmax = 65
xx = seq(from = xmin, to = xmax, length.out = 1000)

Age polynomial plot

Generate a preliminary plot of age vs COMPAS general score

# Lower-bound points colored by whether they sit above the fitted polynomial
# (filt7), with the polynomial itself drawn over the age grid.
ggplot()+
  geom_point(aes(x=p_current_age, raw_score, color = factor(filt7)),alpha=.3, data=lb_age) +
  geom_line(aes(x=xx, predict(mdl_age, newdata=data.frame(p_current_age=xx))),color="#F8766D") +
  theme_bw()+
  xlim(xmin,xmax)+
  xlab("Age at COMPAS screening date") +
  ylab("General score") +
  theme(text = element_text(size=18),
        axis.text=element_text(size=18),
        legend.position="none")

# Same polynomial overlaid on all filtered observations.
ggplot()+
  geom_point(aes(x=p_current_age, raw_score), color="#619CFF",alpha=.3, data=features_age_poly) +
  geom_line(aes(x=xx, predict(mdl_age, newdata=data.frame(p_current_age=xx))),color="#F8766D") +
  theme_bw()+
  xlim(xmin,xmax)+
  xlab("Age at COMPAS screening date") +
  ylab("General score") +
  theme(text = element_text(size=18),
        axis.text=element_text(size=18),
        legend.position="none")

# Re-filter, now additionally requiring points at/above the polynomial (filt7).
features_age_poly2 = features %>% 
    filter(filt1, filt5, filt6, filt7) 
  

# Per-age minimum raw score among the re-filtered observations.
lb_filt = features_age_poly2 %>%
  group_by(p_current_age) %>%
    arrange(raw_score)%>%
    top_n(n=-1, wt = raw_score)

#filtered out points 
lb_outliers = rbind(
                #reason not included in lb_filt was  
                #age not above age poly
                setdiff(lb_age, lb_filt) %>%
                  filter(!filt7) %>% #below age poly
                  mutate(reason = "Below age polynomial")
                
                ) 

# NOTE(review): lb is reduced to 3 columns + reason while lb_outliers carries
# every lb_age column + reason; confirm the rbind() column sets actually
# match here (base rbind errors on mismatched data.frame columns).
lb = lb_filt %>% 
     select(p_current_age, p_age_first_offense, raw_score) %>%
     mutate(reason = "Used to construct linear splines") %>%
     rbind(lb_outliers)

Generating linear splines to fit the lower bound. Plotting individuals on the new bottom edge produced by fitting to the lb_filt individuals.

# Two-breakpoint linear spline of raw score vs. age, fit on the lower-bound
# points; psi gives the initial breakpoint guesses (ages 39 and 58).
mdl_age_spline <- segmented(lm(raw_score ~ p_current_age, data = lb_filt), 
                            seg.Z = ~p_current_age, psi = list(p_current_age = c(39,58)),
  control = seg.control(display = FALSE)
)
#Add Filter 8
# filt8 keeps points at or above the spline minus a 0.05 tolerance.
features = features %>%
  mutate(
    age_spline = predict(mdl_age_spline, newdata=data.frame(p_current_age=p_current_age)),
    raw_score__age_spline = raw_score - age_spline,
    filt8 = raw_score >= age_spline - 0.05
  )
summary.segmented(mdl_age_spline)$psi
##                    Initial     Est.    St.Err
## psi1.p_current_age      39 32.73622 0.4637932
## psi2.p_current_age      58 49.65780 0.9842671
# Estimated breakpoints (the "Est." column of the psi table above).
break1 =  summary.segmented(mdl_age_spline)$psi[1,2]
break2 =  summary.segmented(mdl_age_spline)$psi[2,2]

# Per-segment grids so each spline piece can be drawn in its own color.
xx1 = seq(xmin,break1, length.out=1000)
xx2 = seq(break1,break2, length.out=1000)
xx3 = seq(break2,xmax, length.out=1000)

# Lower-bound points (colored by inclusion reason) with the three spline
# segments overlaid.
age_spline_recid = 
  ggplot()+
  geom_point(aes(x=p_current_age, raw_score, color= factor(reason)),alpha=.5, data=lb ) +
  scale_color_manual(values=c("red", "grey25")) + 
  geom_line(aes(x=xx1, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx1))),
                color="darkorange1", size = 1) +
  geom_line(aes(x=xx2, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx2))),
                color="cyan3", size = 1) +
  geom_line(aes(x=xx3, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx3))),
                color="seagreen3", size = 1) +
  theme_bw()+
  xlim(xmin,xmax)+
  xlab("\n Age at COMPAS screening date") +
  ylab("General Score \n") +
  theme(text = element_text(size=9),
        axis.text=element_text(size=7),
        legend.position = c(.95,.95),
        legend.title = element_blank(),
        legend.justification = c("right", "top"),
        legend.key = element_rect(fill = "aliceblue"),
        legend.background = element_rect(fill="aliceblue",
                                  size=0.5, linetype="solid")
        )

age_spline_recid

ggsave("Figures/age_LB_general.pdf",width = 3.5, height = 2.5, units = "in")
# Age spline with all data (filters 1,5 still applied)
# Red points are below the age polynomial not age spline
ggplot()+
  geom_point(aes(x=p_current_age, raw_score), color="#F8766D", data=features %>% filter(filt1,filt5,!filt7)) + # Age outliers
  geom_point(aes(x=p_current_age, raw_score), color="#619CFF", alpha=.3, data=features %>% filter(filt1,filt5,filt7)) + # Not age outliers
  geom_line(aes(x=xx, predict(mdl_age_spline, newdata=data.frame(p_current_age=xx))),color="#F8766D") +
  theme_bw()+
  xlim(xmin,xmax)+
  xlab("Age at COMPAS screening date") +
  ylab("General score") +
  theme(text = element_text(size=12),
        axis.text=element_text(size=12),
        legend.position="top") 

ggsave("Figures/age_LB_alldata_general.pdf",width = 3.5, height = 2.5, units = "in")

Examine Criminal Involvement lower bound (none found)

# Residual score (raw - age spline) vs. the criminal-involvement subscale,
# colored by whether the point sits above the age spline (filt8).
features %>%
  filter(filt1,filt3) %>% 
  select(crim_inv, raw_score__age_spline, filt8) %>%
  ggplot() +
  geom_point(aes(x=crim_inv,y=raw_score__age_spline,color=filt8),alpha=.5) +
  xlim(c(0,70))+
  theme_bw()+
  xlab("Sum of Criminal Involvement Components") +
  ylab(expression(General~score~-~f[age]))  +
  theme(text = element_text(size=12),
        axis.text=element_text(size=12),
        legend.position="top") +
  scale_color_manual(name=element_blank(),
                       breaks=c("TRUE", "FALSE"),
                       labels=c(expression(Above~f[age]), expression(Below~f[age])),
                       values=c("TRUE"="#619CFF","FALSE"="#00BA38"))
## Warning: Removed 10 rows containing missing values (geom_point).

ggsave("Figures/crim_inv_LB_general.pdf",width = 3.5, height = 2.5, units = "in")
## Warning: Removed 10 rows containing missing values (geom_point).
## Warning: Removed 10 rows containing missing values (geom_point).

Also look at just number of priors

# Same residual-vs-predictor view, but against prior charge count only.
features %>%
  filter(filt1,filt3) %>% 
  select(p_charge, raw_score__age_spline, filt8) %>%
  ggplot() +
  geom_point(aes(x=p_charge,y=raw_score__age_spline,color=filt8),alpha=.5) +
  xlim(c(0,70))+
  theme_bw()+
  xlab("Number of prior charges") +
  ylab(expression(General~score~-~f[age]))  +
  theme(text = element_text(size=12),
        axis.text=element_text(size=12),
        legend.position="top") +
  scale_color_manual(name=element_blank(),
                       breaks=c("TRUE", "FALSE"),
                       labels=c(expression(Above~f[age]), expression(Below~f[age])),
                       values=c("TRUE"="#619CFF","FALSE"="#00BA38"))
## Warning: Removed 6 rows containing missing values (geom_point).

ggsave("Figures/priors_LB_general.pdf",width = 3.5, height = 2.5, units = "in")
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).

Predictions of (raw - age polynomial) using several ML methods

There are a few groups of predictors we will use: * Group 1: without using age variables or race * Group 2: without using age variables but with race * Group 3: without using race but with age variables * Group 4: using age variables and race

#### Generic stuff (applies to all models)

## Filter rows
features_filt = features %>%
  filter(filt1, filt3) 

## Set parameters (each combination will be run)
# xgboost — vector-valued entries define the grid searched by fit_xgboost
# (project helper from functions.r).
param <- list(objective = "reg:linear",
              eval_metric = "rmse",
              eta = c(.05,.1),
              gamma = c(.5, 1), 
              max_depth = c(2,5),
              min_child_weight = c(5,10),
              subsample = c(1),
              colsample_bytree = c(1)
)

# svm — grid searched by fit_svm (project helper from functions.r).
param_svm = list(
  type = 'eps-regression',
  cost = c(0.5,1,2),
  epsilon = c(0.5,1,1.5),
  gamma_scale = c(0.5,1,2)
)

# One RMSE row per predictor group; NA until each model fills its cell.
# NOTE(review): 5 rows allocated but only groups 1-4 appear in this chunk —
# presumably a 5th group exists later in the file; confirm.
res_rmse = data.frame(Group = 1:5, lm = NA, xgb = NA, rf = NA, svm = NA)

Group 1 models: predicting (raw score - age spline) without using age variables or race

### Create group 1 training data

## Group 1 predictors (no age variables, no race), plus the target column.
train = select(
  features_filt,
  #p_current_age,
  p_age_first_offense,
  p_charge,
  p_jail30,
  p_prison,
  p_probation,
  raw_score__age_spline
)

## Format for xgboost: predictor matrix plus the target as the label.
train_xgb = xgb.DMatrix(
  "data" = as.matrix(select(train, -raw_score__age_spline)),
  "label" = as.matrix(select(train, raw_score__age_spline))
)

Model 1: Linear model

# Group 1 linear model: residual score on all non-age, non-race predictors.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.70479 -0.45320 -0.06662  0.37464  2.52256 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.0500813  0.0177354  59.208   <2e-16 ***
## p_age_first_offense -0.0068003  0.0005579 -12.188   <2e-16 ***
## p_charge             0.0257307  0.0010349  24.862   <2e-16 ***
## p_jail30            -0.0075073  0.0404660  -0.186    0.853    
## p_prison             0.2073046  0.0085076  24.367   <2e-16 ***
## p_probation          0.1202088  0.0074718  16.088   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5757 on 9036 degrees of freedom
## Multiple R-squared:  0.4079, Adjusted R-squared:  0.4076 
## F-statistic:  1245 on 5 and 9036 DF,  p-value: < 2.2e-16
# In-sample RMSE (rmse is a project helper from functions.r).
res_rmse[res_rmse$Group==1,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP

Model 2: xgboost

# Group 1 xgboost: grid search over `param` (fit_xgboost is a project helper
# from functions.r); the printout below shows the winning parameter column.
set.seed(923)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  6           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline

res_rmse[res_rmse$Group==1,]$xgb = rmse(pred, actual) # ADJUST GROUP

# Shared axis limits so the prediction-vs-actual plot is square.
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction")+
  theme(
        text = element_text(size=14),
        axis.text=element_text(size=14))

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(784)

## Random forest on the group 1 features (default settings); out-of-bag
## predictions feed the RMSE entry.
mdl_rf = randomForest(
  raw_score__age_spline ~ .,
  data = train
)

res_rmse[res_rmse$Group==1,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP

Model 4: SVM

# Group 1 SVM: grid search over param_svm (fit_svm is a project helper from
# functions.r); the printout shows the winning parameter column.
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             11              
## type        "eps-regression"
## cost        "1"             
## epsilon     "0.5"           
## gamma_scale "1"             
## gamma       "0.1666667"
res_rmse[res_rmse$Group==1,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)

Group 2 models: predicting (raw score - age spline) without using age variables but with race

### Create group 2 training data

## Group 2 predictors (no age variables, race dummies included), plus the
## target column.
train = select(
  features_filt,
  #p_current_age,
  p_age_first_offense,
  p_charge,
  p_jail30,
  p_prison,
  p_probation,
  race_black,
  race_white,
  race_hispanic,
  race_asian,
  race_native, # race == "Other" is the baseline
  raw_score__age_spline
)

## Format for xgboost: predictor matrix plus the target as the label.
train_xgb = xgb.DMatrix(
  "data" = as.matrix(select(train, -raw_score__age_spline)),
  "label" = as.matrix(select(train, raw_score__age_spline))
)

Model 1: Linear model

# Group 2 linear model: adds the race dummies to the group 1 predictors.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.53127 -0.43741 -0.06343  0.36477  2.37217 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.7300096  0.0298034  24.494  < 2e-16 ***
## p_age_first_offense -0.0048329  0.0005678  -8.512  < 2e-16 ***
## p_charge             0.0248485  0.0010184  24.398  < 2e-16 ***
## p_jail30             0.0062706  0.0397791   0.158  0.87475    
## p_prison             0.1974029  0.0083911  23.525  < 2e-16 ***
## p_probation          0.1170682  0.0073462  15.936  < 2e-16 ***
## race_black           0.3670750  0.0257221  14.271  < 2e-16 ***
## race_white           0.2429693  0.0259952   9.347  < 2e-16 ***
## race_hispanic        0.0877653  0.0311553   2.817  0.00486 ** 
## race_asian           0.0964541  0.0859453   1.122  0.26178    
## race_native          0.2561655  0.1097032   2.335  0.01956 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5657 on 9031 degrees of freedom
## Multiple R-squared:  0.4286, Adjusted R-squared:  0.428 
## F-statistic: 677.5 on 10 and 9031 DF,  p-value: < 2.2e-16
res_rmse[res_rmse$Group==2,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP

Model 2: xgboost

# Group 2 xgboost: same grid search as group 1 over the race-augmented data.
set.seed(480)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  14          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline

res_rmse[res_rmse$Group==2,]$xgb = rmse(pred, actual) # ADJUST GROUP

axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

# Prediction vs. residual score.
data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction")+
  theme(
        text = element_text(size=14),
        axis.text=element_text(size=14))

# Prediction vs. the unadjusted COMPAS raw score.
data.frame(xgboost = pred, compas=features_filt$raw_score) %>%
  ggplot() +
  geom_point(aes(x=xgboost,y=compas), alpha=.3) +
  theme_bw()+
  xlab("XGBoost prediction") +
  ylab("COMPAS raw score")+
  theme(
        text = element_text(size=14),
        axis.text=element_text(size=14))

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(6778)

## Random forest on the group 2 features (default settings); out-of-bag
## predictions feed the RMSE entry.
mdl_rf = randomForest(
  raw_score__age_spline ~ .,
  data = train
)

res_rmse[res_rmse$Group==2,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP

Model 4: SVM

# Group 2 SVM: grid search over param_svm; printout shows the winning column.
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             12              
## type        "eps-regression"
## cost        "2"             
## epsilon     "0.5"           
## gamma_scale "1"             
## gamma       "0.09090909"
res_rmse[res_rmse$Group==2,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Group 3 models: predicting (raw score - age spline) without using race but with age variables

### Create group 3 training data

## Group 3 predictors (age variables included, no race), plus the target
## column.
train = select(
  features_filt,
  p_current_age,
  p_age_first_offense,
  p_charge,
  p_jail30,
  p_prison,
  p_probation,
  raw_score__age_spline
)

## Format for xgboost: predictor matrix plus the target as the label.
train_xgb = xgb.DMatrix(
  "data" = as.matrix(select(train, -raw_score__age_spline)),
  "label" = as.matrix(select(train, raw_score__age_spline))
)

Model 1: Linear model

# Group 3 linear model: age variables included, race excluded.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.29274 -0.44847 -0.07196  0.36852  2.53402 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.009042   0.018161  55.561   <2e-16 ***
## p_current_age        0.011531   0.001205   9.567   <2e-16 ***
## p_age_first_offense -0.017721   0.001269 -13.961   <2e-16 ***
## p_charge             0.022992   0.001069  21.511   <2e-16 ***
## p_jail30             0.020861   0.040374   0.517    0.605    
## p_prison             0.183270   0.008830  20.755   <2e-16 ***
## p_probation          0.098896   0.007761  12.742   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5729 on 9035 degrees of freedom
## Multiple R-squared:  0.4138, Adjusted R-squared:  0.4134 
## F-statistic:  1063 on 6 and 9035 DF,  p-value: < 2.2e-16
res_rmse[res_rmse$Group==3,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP

Model 2: xgboost

# Group 3 xgboost: same grid search over the age-augmented (no race) data.
set.seed(999)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  15          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "1"         
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline

res_rmse[res_rmse$Group==3,]$xgb = rmse(pred, actual) # ADJUST GROUP

axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction")+
  theme(
        text = element_text(size=14),
        axis.text=element_text(size=14))

ggsave("Figures/raw-fage_xgboost_gp3_general.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

set.seed(5)

## Random forest on the group 3 features (default settings); out-of-bag
## predictions feed the RMSE entry.
mdl_rf = randomForest(
  raw_score__age_spline ~ .,
  data = train
)

res_rmse[res_rmse$Group==3,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP

Model 4: SVM

# Group 3 SVM: grid search over param_svm; printout shows the winning column.
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             20              
## type        "eps-regression"
## cost        "1"             
## epsilon     "0.5"           
## gamma_scale "2"             
## gamma       "0.2857143"
res_rmse[res_rmse$Group==3,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Group 4 models: predicting (raw score - age spline) using age variables and race

### Create group 4 training data (the original comment said "group 2")

## Group 4 predictors (age variables and race dummies), plus the target
## column.
train = select(
  features_filt,
  p_current_age,
  p_age_first_offense,
  p_charge,
  p_jail30,
  p_prison,
  p_probation,
  race_black,
  race_white,
  race_hispanic,
  race_asian,
  race_native, # race == "Other" is the baseline
  raw_score__age_spline
)

## Format for xgboost: predictor matrix plus the target as the label.
train_xgb = xgb.DMatrix(
  "data" = as.matrix(select(train, -raw_score__age_spline)),
  "label" = as.matrix(select(train, raw_score__age_spline))
)

Model 1: Linear model

# Group 4 linear model: age variables and race dummies both included.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.11700 -0.43169 -0.06356  0.36301  2.38052 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.689065   0.029955  23.003  < 2e-16 ***
## p_current_age        0.011449   0.001187   9.646  < 2e-16 ***
## p_age_first_offense -0.015629   0.001254 -12.466  < 2e-16 ***
## p_charge             0.022124   0.001052  21.033  < 2e-16 ***
## p_jail30             0.034811   0.039688   0.877  0.38045    
## p_prison             0.173324   0.008714  19.891  < 2e-16 ***
## p_probation          0.095996   0.007629  12.584  < 2e-16 ***
## race_black           0.367757   0.025592  14.370  < 2e-16 ***
## race_white           0.237048   0.025871   9.163  < 2e-16 ***
## race_hispanic        0.094500   0.031006   3.048  0.00231 ** 
## race_asian           0.105556   0.085516   1.234  0.21711    
## race_native          0.244428   0.109155   2.239  0.02516 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5628 on 9030 degrees of freedom
## Multiple R-squared:  0.4345, Adjusted R-squared:  0.4338 
## F-statistic: 630.6 on 11 and 9030 DF,  p-value: < 2.2e-16
res_rmse[res_rmse$Group==4,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP

Model 2: xgboost

# Group 4 xgboost: grid search over the full (age + race) predictor set.
set.seed(23)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  5           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline

res_rmse[res_rmse$Group==4,]$xgb = rmse(pred, actual) # ADJUST GROUP

axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))

data.frame(xgboost = pred, compas=actual) %>%
  ggplot() +
  geom_point(aes(x=compas,y=xgboost), alpha=.3) +
  geom_abline(slope=1, color="red")+
  xlim(c(axis_min,axis_max)) +
  ylim(c(axis_min,axis_max)) +
  coord_fixed() +
  theme_bw()+
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction")+
  theme(
        text = element_text(size=12),
        axis.text=element_text(size=12))

ggsave("Figures/raw-fage_xgboost_gp4_general.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

# Hard-coded (person_id, screening_date) pairs to highlight in the annotated
# plot — presumably the individuals listed in Table 5 of the paper; confirm.
highlight = data.frame(
  person_id= c(799, 1284, 1394, 1497, 1515, 1638, 3145, 3291, 5722, 6337, 6886, 7997, 8200, 8375, 8491, 10553, 10774, 11231, 11312, 11414),
  screening_date = ymd(c("2014-06-15","2014-05-14","2014-11-28","2013-07-29","2013-10-23","2013-10-04","2014-12-14","2013-01-17","2013-10-24","2014-02-04","2013-07-12","2014-04-26","2014-05-05","2013-03-19","2014-01-18","2014-09-20","2013-04-09","2014-02-23","2014-05-02","2014-11-26")),
  highlight = TRUE
)

# Attach xgboost predictions and a two-level highlight factor; rows not in
# the highlight table get NA from the left join, converted to FALSE.
df_plot = features_filt %>%
  bind_cols(xgboost = predict(mdl_xgb, newdata=train_xgb)) %>%
  left_join(highlight, by = c("person_id","screening_date")) %>%
  mutate(highlight = if_else(is.na(highlight), FALSE, TRUE)) %>%
  mutate(highlight = factor(if_else(highlight==TRUE,"In Table 5", "Not in Table 5"), levels=c("In Table 5", "Not in Table 5")))

# Which highlighted ids get their name label anchored at which corner.
person_id_text_topright = c(8375, 11231, 1515)
#person_id_text_topright = highlight$person_id
person_id_text_topleft = c(1394, 1497)
person_id_text_botright = c(11312, 6886, 8491, 10774)
person_id_text_botleft = c(799)

## Scatter of xgboost prediction (x) vs. COMPAS general raw score (y).
## Table 5 individuals are drawn opaque and labeled by name; all others are
## faded. The four geom_text layers differ only in hjust/vjust so each
## group of labels is anchored in a different quadrant around its point.
ggplot() +
  # Background layer: everyone not in Table 5, drawn semi-transparent.
  geom_point(aes(x=xgboost,y=raw_score, color=highlight),  alpha = .3, data = filter(df_plot, highlight=="Not in Table 5")) +
  # Foreground layer: Table 5 individuals, fully opaque.
  geom_point(aes(x=xgboost,y=raw_score, color=highlight),  data = filter(df_plot, highlight=="In Table 5")) +
  theme_bw()+
  # Labels extending up-and-right of the point.
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topright & highlight=="In Table 5")) + 
  # Labels extending up-and-left.
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topleft & highlight=="In Table 5")) + 
  # Labels extending down-and-right.
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botright & highlight=="In Table 5")) + 
  # Labels extending down-and-left.
  geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botleft & highlight=="In Table 5")) + 
  xlab(expression(XGBoost~pred.~of~general~score~-~f[age])) +
  ylab("General score")+
  theme(
    text = element_text(size=12),
    axis.text=element_text(size=12),
    #legend.position = "top",
    legend.position="none") +
  scale_color_discrete(name = element_blank()) +
  xlim(0.2,3.5)

# ggsave() writes the most recently displayed plot by default.
ggsave("Figures/xgboost_rawScore_annotate_general.pdf",width = 4, height = 4, units = "in")

Model 3: Random forest

set.seed(3720) # reproducibility of the forest's bootstrap resampling

## Random forest with the same formula/target as the other group-4 models.
mdl_rf <- randomForest(
  formula = raw_score__age_spline ~ .,
  data = train
)

## NOTE(review): mdl_rf$predicted holds out-of-bag predictions (see
## ?randomForest), so this RMSE is an OOB estimate, not an in-sample fit.
res_rmse[res_rmse$Group==4,]$rf <- rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP

Model 4: SVM

## Tune and fit an SVM via the project helper (defined in functions.r).
mdl_svm <- fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             20              
## type        "eps-regression"
## cost        "1"             
## epsilon     "0.5"           
## gamma_scale "2"             
## gamma       "0.1666667"
## $fitted holds in-sample fitted values from the tuned model.
res_rmse[res_rmse$Group==4,]$svm <- rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Group 5 models: test

### Create group 5 training data

## Select features and round count features. Group 5 adds race indicators
## on top of the criminal-history features.
train <- features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_charge,
    p_arrest,
    p_jail30,
    p_prison30,
    p_prison,
    p_probation,
    race_black,
    race_white,
    race_hispanic,
    race_asian,
    race_native, # race == "Other" is the baseline
    raw_score__age_spline)

## Format for xgboost: feature matrix without the target, label = target.
train_xgb <- xgb.DMatrix(
  "data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
  "label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)

Model 1: Linear model

## Linear model on all group-5 features. Per the summary output below,
## p_prison30 gets an NA coefficient ("1 not defined because of
## singularities"), i.e. the fit is rank-deficient due to collinearity.
mdl_lm <- lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
## 
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98368 -0.43395 -0.06177  0.36190  2.38617 
## 
## Coefficients: (1 not defined because of singularities)
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.692209   0.029933  23.126  < 2e-16 ***
## p_current_age        0.012003   0.001192  10.067  < 2e-16 ***
## p_age_first_offense -0.016215   0.001259 -12.875  < 2e-16 ***
## p_charge             0.013844   0.002143   6.460 1.10e-10 ***
## p_arrest             0.007263   0.001638   4.433 9.39e-06 ***
## p_jail30             0.022716   0.039741   0.572  0.56761    
## p_prison30                 NA         NA      NA       NA    
## p_prison             0.168931   0.008761  19.282  < 2e-16 ***
## p_probation          0.083112   0.008156  10.190  < 2e-16 ***
## race_black           0.368977   0.025567  14.432  < 2e-16 ***
## race_white           0.238644   0.025847   9.233  < 2e-16 ***
## race_hispanic        0.096824   0.030978   3.126  0.00178 ** 
## race_asian           0.106359   0.085428   1.245  0.21316    
## race_native          0.239245   0.109049   2.194  0.02827 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5623 on 9029 degrees of freedom
## Multiple R-squared:  0.4357, Adjusted R-squared:  0.4349 
## F-statistic: 580.9 on 12 and 9029 DF,  p-value: < 2.2e-16
## In-sample RMSE for the linear model. predict() warns because the fit is
## rank-deficient (p_prison30 dropped above); the warning is expected here.
res_rmse[res_rmse$Group==5,]$lm <- rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
## Warning in predict.lm(mdl_lm, newdata = train): prediction from a rank-
## deficient fit may be misleading

Model 2: xgboost

set.seed(480) # reproducibility of the xgboost parameter search
mdl_xgb <- fit_xgboost(train_xgb, param) # project helper from functions.r
## Training on 16 sets of parameters.
##                  14          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
## In-sample predictions, group-5 RMSE entry, and calibration-style scatter.
pred <- predict(mdl_xgb, newdata = train_xgb)
actual <- train$raw_score__age_spline

res_rmse[res_rmse$Group==5,]$xgb <- rmse(pred, actual) # ADJUST GROUP

## min()/max() are variadic — one call over both vectors gives shared limits.
axis_min <- min(pred, actual)
axis_max <- max(pred, actual)

data.frame(xgboost = pred, compas = actual) %>%
  ggplot() +
  geom_point(aes(x = compas, y = xgboost), alpha = .3) +
  geom_abline(slope = 1, color = "red") +   # perfect-prediction reference
  xlim(c(axis_min, axis_max)) +
  ylim(c(axis_min, axis_max)) +
  coord_fixed() +
  theme_bw() +
  xlab(expression(General~score~-~f[age])) +
  ylab("XGBoost prediction") +
  theme(
        text = element_text(size=14),
        axis.text=element_text(size=14))

### Variable importance
## Per-feature importance (gain) from the fitted xgboost model.
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: Random forest

set.seed(1123) # reproducibility of the forest's bootstrap resampling

## Random forest with the same formula/target as the other group-5 models.
mdl_rf <- randomForest(
  formula = raw_score__age_spline ~ .,
  data = train
)

## NOTE(review): mdl_rf$predicted holds out-of-bag predictions (see
## ?randomForest), so this RMSE is an OOB estimate, not an in-sample fit.
res_rmse[res_rmse$Group==5,]$rf <- rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP

Model 4: SVM

## Tune and fit an SVM via the project helper (defined in functions.r).
mdl_svm <- fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
##             21              
## type        "eps-regression"
## cost        "2"             
## epsilon     "0.5"           
## gamma_scale "2"             
## gamma       "0.1428571"
## $fitted holds in-sample fitted values from the tuned model.
res_rmse[res_rmse$Group==5,]$svm <- rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP

Cleanup

rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Comparison

knitr::kable(res_rmse)
Group lm xgb rf svm
1 0.5755271 0.5224769 0.5553091 0.5312492
2 0.5653542 0.5131807 0.5274510 0.5217062
3 0.5726338 0.5204973 0.5334502 0.5224062
4 0.5624640 0.5065970 0.5249744 0.5149892
5 0.5618528 0.4929722 0.5142440 0.5033342

Predictions using xgboost only

We use the “Group 3” models but predict the raw score directly. Results from this section can be combined with the Group 3 xgboost results above, where we predicted the raw score minus the age lower bound.

Predicting the raw score

### Create group 3 training data

## Select features and round count features. Same feature set as Group 3,
## but the target is the raw score itself (not raw score minus age spline).
train <- features_filt %>%
  select(
    p_current_age,
    p_age_first_offense,
    p_charge,
    p_jail30,
    p_prison,
    p_probation,
    raw_score)

## Format for xgboost: feature matrix without the target, label = target.
train_xgb <- xgb.DMatrix(
  "data" = train %>% select(-raw_score) %>% as.matrix(),
  "label" = train %>% select(raw_score) %>% as.matrix()
)

set.seed(540) # reproducibility of the xgboost parameter search
mdl_xgb <- fit_xgboost(train_xgb, param) # project helper from functions.r
## Training on 16 sets of parameters.
##                  5           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
## In-sample predictions of the raw score, printed RMSE, and scatter plot.
pred <- predict(mdl_xgb, newdata = train_xgb)
actual <- train$raw_score

print(paste("RMSE:",round(rmse(pred, actual),4)))
## [1] "RMSE: 0.511"
## min()/max() are variadic — one call over both vectors gives shared limits.
axis_min <- min(pred, actual)
axis_max <- max(pred, actual)

data.frame(xgboost = pred, compas = actual) %>%
  ggplot() +
  geom_point(aes(x = compas, y = xgboost), alpha = .3) +
  geom_abline(slope = 1, color = "red") +   # perfect-prediction reference
  xlim(c(axis_min, axis_max)) +
  ylim(c(axis_min, axis_max)) +
  coord_fixed() +
  theme_bw() +
  xlab("General score") +
  ylab("XGBoost prediction") +
  #annotate("text", x = -3.5, y = 0.5, label = paste("RMSE:",round(rmse(pred, actual),4)))+
  theme(
    text = element_text(size=14),
    axis.text=element_text(size=14))

## ggsave() writes the most recently displayed plot by default.
ggsave("Figures/raw_xgboost_gp3_general.pdf",width = 3, height = 3, units = "in")

Replicating ProPublica logistic regression

## Build the dataset for the ProPublica-style logistic regression:
## binary age bands, sex indicator, prior counts, and a binarized COMPAS
## decile score, with race releveled so Caucasian is the reference group.
propub <- features_filt %>%
  filter(filt4) %>% # Only people with valid recidivism values
  mutate(age_low = if_else(p_current_age < 25,1,0), 
         age_high = if_else(p_current_age > 45,1,0),
         female = if_else(sex=="Female",1,0),
         n_priors = p_felony_count_person + p_misdem_count_person,
         compas_high = if_else(`Risk of Recidivism_decile_score` >= 5, 1, 0), # Medium and High risk scores get +1 label
         race = relevel(factor(race), ref="Caucasian")) # Base level is Caucasian, as in ProPublica analysis

print(paste("Number of observations for logistic regression:",nrow(propub)))
## [1] "Number of observations for logistic regression: 5759"
## Logistic regression replicating ProPublica's model of a "high" COMPAS
## score (decile >= 5). NOTE(review): race was already converted to a
## releveled factor when propub was built, so as.factor(race) is redundant;
## it is kept so the coefficient names match the printed summary below.
mdl_glm <- glm(compas_high ~
                female +
                age_high +
                age_low +
                as.factor(race) +
                p_charge +
                is_misdem +
                recid,
                family=binomial(link='logit'), data=propub)

summary(mdl_glm)
## 
## Call:
## glm(formula = compas_high ~ female + age_high + age_low + as.factor(race) + 
##     p_charge + is_misdem + recid, family = binomial(link = "logit"), 
##     data = propub)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.7344  -0.7673  -0.3035   0.8381   2.6740  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -1.593179   0.082309 -19.356  < 2e-16 ***
## female                           0.123516   0.085171   1.450   0.1470    
## age_high                        -1.489311   0.129772 -11.476  < 2e-16 ***
## age_low                          1.445909   0.071243  20.296  < 2e-16 ***
## as.factor(race)African-American  0.521072   0.072977   7.140 9.32e-13 ***
## as.factor(race)Asian            -0.271728   0.503999  -0.539   0.5898    
## as.factor(race)Hispanic         -0.301324   0.130461  -2.310   0.0209 *  
## as.factor(race)Native American   0.390718   0.678081   0.576   0.5645    
## as.factor(race)Other            -0.713647   0.159728  -4.468 7.90e-06 ***
## p_charge                         0.155033   0.006521  23.773  < 2e-16 ***
## is_misdem                       -0.464124   0.069574  -6.671 2.54e-11 ***
## recid                            0.491790   0.068811   7.147 8.87e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7907.2  on 5758  degrees of freedom
## Residual deviance: 5665.7  on 5747  degrees of freedom
## AIC: 5689.7
## 
## Number of Fisher Scoring iterations: 5